// CuNNy 8x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

// Magpie effect header: marks this file as a Magpie effect using format
// version 4; SORT_NAME is presumably the key used when listing effects.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N08

// Source image; pass 1 reads its RGB channels.
//!TEXTURE
Texture2D INPUT;

// Destination image, declared at twice the input width and height.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler; the O() macro uses it for every tap.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linear-filtered sampler; not referenced by the passes visible in this part
// of the file.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// O(t, p): sample texture t at mip level 0 with the point sampler, at
// `pos + p * pt`. The macro captures `pos` and `pt` by name, so both must be
// in scope wherever it is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the minimum-precision vector and matrix types.
#define V4 min16float4
#define M4 min16float4x4

// Intermediate feature texture at input resolution, four signed-normalized
// 8-bit channels. The passes alternate between t0 and t1 as input and output.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;

// Second intermediate feature texture, same size and format as t0.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;

// Pass 1 ("in"): reads INPUT and writes t0.
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0

// l0(x, y): point-sample INPUT at offset (x, y) * pt from pos, reduce its RGB
// to one scalar with a fixed weighted sum plus a constant offset, and narrow
// the result to min16float.
#define l0(x, y) min16float((dot(float3(-1.880e-01, -3.696e-01, -8.936e-02), O(INPUT, float2(x, y)).rgb) + 5.137e-01))

// Input kernel for pass 1: maps nine scalar taps to a four-channel value.
// Each tap scales its own constant 4-vector; the products are summed in tap
// order starting from zero and a constant bias is added last. The order is
// kept as-is so min16float rounding does not change.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
	V4 acc = 0.0;
	acc += V4(6.049e-03, -3.524e-01, -1.308e-01, -6.691e-02) * s0_0;
	acc += V4(1.720e-02, -7.092e-02, -3.030e-01, 1.654e-01) * s0_1;
	acc += V4(-6.706e-03, 2.289e-01, 1.982e-03, -5.756e-02) * s0_2;
	acc += V4(-2.761e-02, 5.050e-01, -2.036e-01, 1.265e-01) * s0_3;
	acc += V4(-8.654e-01, -6.035e-01, -2.119e-01, 5.055e-01) * s0_4;
	acc += V4(-7.114e-03, 2.325e-02, 5.721e-02, 4.585e-02) * s0_5;
	acc += V4(2.796e-01, 1.680e-01, 1.353e-01, 1.286e-02) * s0_6;
	acc += V4(5.684e-01, 3.022e-01, 6.426e-01, 8.931e-02) * s0_7;
	acc += V4(3.723e-02, -2.036e-01, 2.732e-02, -4.101e-02) * s0_8;
	// Bias term.
	acc += V4(1.324e-02, -9.379e-05, 8.452e-03, 5.165e-02);
	return acc;
}

// Pass 1 entry point. Each thread handles one texel: it gathers the 3x3
// neighbourhood of INPUT through l0(), runs it through f0(), and stores the
// four-channel result in t0.
void Pass1(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that land outside the image have nothing to write.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	// `pt` and `pos` must keep these names: the O()/l0() macros capture them.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// Taps are passed row by row: y = -1, 0, +1; x = -1, 0, +1 within a row.
	t0[texel] = f0(
		l0(-1.0, -1.0), l0(0.0, -1.0), l0(1.0, -1.0),
		l0(-1.0, 0.0), l0(0.0, 0.0), l0(1.0, 0.0),
		l0(-1.0, 1.0), l0(0.0, 1.0), l0(1.0, 1.0));
}

// Pass 2 ("conv1"): reads t0 and writes t1.
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

// l0(x, y): point-sample t0 at offset (x, y) * pt from pos, as a V4.
#define l0(x, y) V4(O(t0, float2(x, y)))

// conv1 kernel. Starting from zero, adds mul(tap, 4x4 weights) for the nine
// s0_* taps, then for the nine s1_* taps, and finally a constant bias.
// Pass2 supplies s0_* as max(sample, 0) and s1_* as -max(-sample, 0) of the
// same 3x3 samples. The accumulation order is kept as-is so min16float
// rounding does not change.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	acc += mul(s0_0, M4(2.216e-02, 1.062e-01, -3.433e-03, -1.923e-01, 6.300e-02, -4.594e-01, 2.025e-01, 8.655e-03, -5.497e-02, 1.694e-01, -1.806e-01, 2.115e-01, -6.176e-02, 1.167e-02, -5.987e-02, 1.167e-01));
	acc += mul(s0_1, M4(-1.646e-01, -5.524e-01, -1.352e-01, 1.704e-01, 3.398e-02, -2.598e-01, 1.616e-01, -1.772e-01, -5.648e-02, 2.755e-01, 2.638e-02, -2.657e-02, 3.774e-02, -6.833e-02, -1.141e-01, -2.438e-01));
	acc += mul(s0_2, M4(-1.459e-01, 9.939e-02, -6.457e-04, 2.352e-02, 5.006e-02, -7.759e-01, -4.862e-02, -3.366e-02, 9.508e-02, 1.537e-01, -6.771e-02, -1.260e-01, 1.067e-01, -5.893e-02, -9.811e-02, -1.060e-02));
	acc += mul(s0_3, M4(-2.901e-01, 2.907e-01, 2.178e-01, -3.877e-01, 9.034e-03, 8.718e-03, -1.213e-01, 9.252e-02, 3.286e-01, -8.247e-02, -5.573e-02, -3.852e-01, -1.371e-01, 1.877e-01, 2.337e-01, 5.324e-01));
	acc += mul(s0_4, M4(-9.182e-01, 1.013e-01, 2.969e-01, 7.117e-01, -2.367e-01, -7.128e-02, 1.828e-01, 5.993e-01, -2.965e-01, 1.323e-01, 3.117e-02, -3.215e-01, -1.410e-01, 5.359e-02, -1.137e-01, -2.603e-01));
	acc += mul(s0_5, M4(-1.071e-01, -8.801e-02, 9.524e-03, -2.937e-02, 7.723e-02, 1.195e-01, -9.056e-02, 6.161e-02, 1.962e-01, -2.740e-01, -9.418e-02, 1.141e-01, 6.203e-02, -1.084e-01, 2.402e-01, -2.066e-01));
	acc += mul(s0_6, M4(2.226e-01, -2.259e-01, -2.499e-02, -9.184e-02, -1.499e-01, -3.737e-02, 1.576e-01, 1.084e-01, -2.221e-01, -1.080e-02, 2.643e-02, -1.023e-01, 1.068e-01, 1.193e-01, -2.781e-01, 3.396e-01));
	acc += mul(s0_7, M4(7.520e-01, -1.043e-01, -4.535e-02, 2.775e-01, 1.577e-01, -1.526e-01, 1.796e-01, 1.085e-01, -1.012e+00, 4.333e-02, 1.270e-02, -1.692e-01, 1.127e-01, -2.847e-01, -1.784e-01, -3.956e-01));
	acc += mul(s0_8, M4(2.206e-01, 1.370e-01, -7.453e-02, 1.050e-01, 8.412e-02, -1.396e-01, 1.707e-02, -1.654e-02, -2.116e-01, -7.944e-02, 1.244e-01, -6.709e-02, -5.577e-02, 1.619e-01, -2.818e-01, 1.460e-01));
	acc += mul(s1_0, M4(1.180e-01, -2.345e-01, 5.406e-02, -1.102e-01, 1.559e-02, -3.865e-01, -1.077e-01, 1.442e-02, -1.405e-01, 1.578e-01, -3.338e-02, 1.157e-01, -1.676e-01, 4.656e-02, -1.507e-01, 2.590e-02));
	acc += mul(s1_1, M4(-3.112e-02, -5.537e-01, -3.626e-01, -2.915e-01, 7.495e-02, 4.473e-01, -1.847e-01, -8.743e-02, -3.290e-02, 3.660e-02, 1.252e-01, 1.058e-02, 1.193e-01, 6.421e-02, -1.456e-01, -1.693e-01));
	acc += mul(s1_2, M4(-1.047e-01, -4.306e-01, 6.486e-03, 1.137e-01, 2.935e-02, -3.608e-01, 5.242e-02, -2.374e-02, 1.130e-01, -4.864e-02, -7.302e-02, -2.205e-02, 8.227e-02, -8.403e-02, -9.468e-02, 8.095e-02));
	acc += mul(s1_3, M4(-3.759e-02, 2.709e-01, 1.269e-01, -4.994e-01, -1.577e-02, 1.871e-01, -2.532e-01, 8.960e-02, 2.298e-01, -2.462e-01, -1.634e-02, -3.955e-01, 2.750e-02, -4.812e-02, -2.441e-01, 9.926e-01));
	acc += mul(s1_4, M4(-7.288e-01, 5.644e-01, 1.042e+00, 6.160e-01, -4.271e-01, 4.419e-01, 1.437e-01, 3.840e-01, -1.220e-01, -8.627e-01, 6.664e-02, -1.220e-02, 5.260e-02, 1.505e-01, -2.182e-01, -6.116e-01));
	acc += mul(s1_5, M4(1.659e-01, 2.566e-01, -5.954e-02, -9.187e-02, -8.251e-02, 1.091e-01, -1.506e-01, 1.370e-01, 3.056e-01, -3.512e-01, -4.956e-03, 7.008e-02, 1.320e-01, -3.995e-01, -8.603e-03, -3.542e-01));
	acc += mul(s1_6, M4(2.549e-01, -7.946e-02, -1.755e-01, -2.902e-02, -1.912e-01, 2.349e-01, 6.770e-02, 9.683e-02, -2.690e-01, -1.715e-01, 5.692e-02, -1.064e-01, 2.998e-01, 7.619e-02, 8.040e-03, 2.706e-01));
	acc += mul(s1_7, M4(7.320e-01, 1.397e-01, -5.600e-02, 9.609e-02, -1.267e-01, 6.841e-02, 2.429e-01, 3.167e-02, -6.816e-01, -3.313e-03, 5.622e-02, -4.727e-02, -3.420e-01, 4.283e-02, -3.250e-01, -4.118e-01));
	acc += mul(s1_8, M4(1.607e-01, 1.581e-01, -6.049e-02, 9.118e-02, -1.583e-02, 2.918e-01, 1.703e-02, -1.206e-01, -2.114e-01, -1.248e-01, 6.689e-02, -2.131e-02, -7.779e-02, 1.069e-01, -1.181e-01, 2.230e-01));
	// Bias term.
	acc += V4(1.959e-02, -5.807e-03, 9.415e-02, 7.247e-03);
	return acc;
}

// Pass 2 entry point. Each thread handles one texel: it reads the 3x3
// neighbourhood of t0, splits every sample into a non-negative part and a
// non-positive part, runs both sets through f0(), and stores the result in t1.
void Pass2(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that land outside the image have nothing to write.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	// `pt` and `pos` must keep these names: the O()/l0() macros capture them.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps. Suffix letters give the (x, y) offset: m = -1, z = 0, p = +1.
	const V4 tap_mm = l0(-1.0, -1.0);
	const V4 tap_zm = l0(0.0, -1.0);
	const V4 tap_pm = l0(1.0, -1.0);
	const V4 tap_mz = l0(-1.0, 0.0);
	const V4 tap_zz = l0(0.0, 0.0);
	const V4 tap_pz = l0(1.0, 0.0);
	const V4 tap_mp = l0(-1.0, 1.0);
	const V4 tap_zp = l0(0.0, 1.0);
	const V4 tap_pp = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t1[texel] = f0(
		max(tap_mm, 0.0), max(tap_zm, 0.0), max(tap_pm, 0.0),
		max(tap_mz, 0.0), max(tap_zz, 0.0), max(tap_pz, 0.0),
		max(tap_mp, 0.0), max(tap_zp, 0.0), max(tap_pp, 0.0),
		-max(-tap_mm, 0.0), -max(-tap_zm, 0.0), -max(-tap_pm, 0.0),
		-max(-tap_mz, 0.0), -max(-tap_zz, 0.0), -max(-tap_pz, 0.0),
		-max(-tap_mp, 0.0), -max(-tap_zp, 0.0), -max(-tap_pp, 0.0));
}

// Pass 3 ("conv2"): reads t1 and writes t0.
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

// l0(x, y): point-sample t1 at offset (x, y) * pt from pos, as a V4.
#define l0(x, y) V4(O(t1, float2(x, y)))

// conv2 kernel. Starting from zero, adds mul(tap, 4x4 weights) for the nine
// s0_* taps, then for the nine s1_* taps, and finally a constant bias.
// Pass3 supplies s0_* as max(sample, 0) and s1_* as -max(-sample, 0) of the
// same 3x3 samples. The accumulation order is kept as-is so min16float
// rounding does not change.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	acc += mul(s0_0, M4(9.727e-02, 1.849e-01, 2.125e-02, 1.933e-01, 9.183e-02, 8.307e-03, -9.035e-02, 3.241e-02, 1.141e-01, 8.739e-02, -9.547e-02, 1.616e-01, 2.912e-02, -1.780e-02, 5.433e-02, 2.720e-02));
	acc += mul(s0_1, M4(-1.524e-01, -9.138e-02, 8.798e-02, -1.691e-01, 8.519e-03, 3.597e-02, -1.784e-02, 3.049e-02, 3.078e-02, 1.823e-01, 1.051e-02, -5.317e-02, -1.977e-01, 1.013e-01, 1.215e-01, 4.261e-02));
	acc += mul(s0_2, M4(-1.992e-02, -1.191e-01, -1.365e-03, 3.976e-02, 3.452e-03, 7.503e-03, 4.850e-03, 8.970e-03, -7.652e-03, 1.166e-01, 9.888e-02, 3.423e-03, -3.354e-01, -3.335e-01, -2.226e-02, -1.509e-01));
	acc += mul(s0_3, M4(-7.994e-02, 1.374e-01, -1.701e-02, -2.530e-01, 2.153e-01, -6.957e-03, -1.405e-01, -6.175e-02, 7.274e-03, 1.734e-01, -9.107e-02, -1.303e-01, -1.265e-01, 1.669e-02, 3.494e-02, -8.377e-02));
	acc += mul(s0_4, M4(-1.124e+00, 1.355e-02, -1.979e-01, -4.092e-01, -1.276e-01, -1.096e-01, 5.949e-02, 1.073e-01, -4.780e-02, 1.378e-01, 1.905e-01, -9.525e-02, -5.999e-01, 1.274e-01, 8.416e-01, 2.483e-01));
	acc += mul(s0_5, M4(3.312e-01, 2.036e-01, -5.231e-02, 5.357e-02, 1.666e-03, -2.102e-03, -3.213e-03, 4.747e-02, 1.130e-01, 3.492e-01, -1.263e-01, 4.100e-01, -5.859e-01, 4.875e-02, 2.227e-01, 3.127e-01));
	acc += mul(s0_6, M4(-3.699e-02, 6.066e-02, 3.448e-03, -4.158e-03, -4.048e-03, -3.619e-02, -8.830e-02, -8.917e-03, 2.990e-02, 6.919e-03, 9.803e-02, 2.188e-02, 5.674e-02, -3.122e-02, -6.793e-02, 8.573e-02));
	acc += mul(s0_7, M4(-1.255e-01, 1.754e-01, -1.332e-01, -1.124e-01, -2.163e-01, 1.552e-02, -7.485e-04, 4.194e-02, -1.899e-01, 1.334e-01, -1.721e-01, -3.487e-01, 3.847e-01, -3.823e-02, 1.121e-02, -7.128e-02));
	acc += mul(s0_8, M4(7.152e-02, -1.631e-02, 4.810e-02, 1.435e-01, 3.881e-02, -3.596e-02, -7.544e-03, -1.071e-01, -8.509e-02, 1.110e-01, 8.542e-02, 1.980e-02, -1.134e-01, -7.967e-02, -1.586e-01, 2.511e-01));
	acc += mul(s1_0, M4(2.326e-01, 4.791e-02, -1.996e-01, 1.352e-02, -9.909e-03, 1.117e-01, 2.198e-02, -6.683e-02, 1.356e-01, 2.830e-01, -8.418e-02, 2.137e-01, -1.401e-02, -7.056e-02, 5.360e-02, 6.243e-02));
	acc += mul(s1_1, M4(7.739e-01, -3.172e-01, -2.031e-01, 2.054e-01, -1.263e-01, -7.571e-03, 8.090e-02, -1.372e-01, 1.053e-01, 2.982e-01, -6.235e-02, 1.452e-02, 1.973e-01, 9.233e-02, -1.067e-01, 1.088e-01));
	acc += mul(s1_2, M4(-1.136e-01, -1.332e-01, -7.369e-02, 2.046e-01, -9.302e-02, 2.722e-02, 9.461e-02, -1.895e-01, 1.216e-02, 2.595e-01, 1.028e-01, 8.413e-02, -1.339e-01, -2.259e-01, -1.047e-01, 5.994e-02));
	acc += mul(s1_3, M4(1.224e-01, -3.713e-02, -2.383e-01, -1.743e-01, -1.876e-01, 1.155e-01, 2.212e-01, -1.375e-01, 1.618e-01, 2.628e-01, -1.161e-01, -1.826e-01, 8.003e-02, -1.961e-02, -6.278e-02, -5.710e-02));
	acc += mul(s1_4, M4(-2.647e-01, -1.603e-01, -7.731e-01, 1.958e-01, -4.093e-01, -1.110e-01, 3.352e-01, -3.093e-02, -6.201e-01, 3.073e-01, 3.779e-01, -2.733e-01, 4.035e-01, 1.230e-01, -1.606e-01, 9.421e-02));
	acc += mul(s1_5, M4(1.981e-01, -8.801e-03, -9.874e-03, -4.003e-02, 2.686e-03, -1.346e-01, -1.813e-02, -1.003e-01, 1.561e-01, 3.252e-01, -1.189e-01, 2.014e-01, 1.343e-01, 4.088e-02, -9.918e-02, 1.025e+00));
	acc += mul(s1_6, M4(-2.323e-02, 3.284e-02, -5.099e-03, -3.025e-02, -1.458e-02, -1.640e-02, 1.268e-01, -3.787e-02, 5.078e-02, 4.529e-02, 1.050e-02, -8.079e-03, -1.530e-02, -6.509e-02, -1.620e-01, 6.662e-02));
	acc += mul(s1_7, M4(3.972e-02, 8.570e-02, -8.723e-02, -3.746e-02, -1.902e-01, 5.121e-02, 1.161e-01, -4.624e-02, -6.268e-02, 1.852e-01, -1.535e-01, -2.023e-01, 2.476e-01, -2.211e-02, -1.590e-01, -3.109e-02));
	acc += mul(s1_8, M4(-8.025e-03, -4.798e-02, 5.162e-02, 6.616e-02, -2.416e-02, -5.815e-02, -1.334e-02, -1.029e-01, 5.381e-02, 1.539e-01, 4.511e-02, 1.426e-01, -5.511e-02, -9.311e-02, -3.072e-02, 1.572e-01));
	// Bias term.
	acc += V4(3.240e-02, -1.989e-01, -2.700e-02, 6.578e-03);
	return acc;
}

// Pass 3 entry point. Each thread handles one texel: it reads the 3x3
// neighbourhood of t1, splits every sample into a non-negative part and a
// non-positive part, runs both sets through f0(), and stores the result in t0.
void Pass3(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that land outside the image have nothing to write.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	// `pt` and `pos` must keep these names: the O()/l0() macros capture them.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps. Suffix letters give the (x, y) offset: m = -1, z = 0, p = +1.
	const V4 tap_mm = l0(-1.0, -1.0);
	const V4 tap_zm = l0(0.0, -1.0);
	const V4 tap_pm = l0(1.0, -1.0);
	const V4 tap_mz = l0(-1.0, 0.0);
	const V4 tap_zz = l0(0.0, 0.0);
	const V4 tap_pz = l0(1.0, 0.0);
	const V4 tap_mp = l0(-1.0, 1.0);
	const V4 tap_zp = l0(0.0, 1.0);
	const V4 tap_pp = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t0[texel] = f0(
		max(tap_mm, 0.0), max(tap_zm, 0.0), max(tap_pm, 0.0),
		max(tap_mz, 0.0), max(tap_zz, 0.0), max(tap_pz, 0.0),
		max(tap_mp, 0.0), max(tap_zp, 0.0), max(tap_pp, 0.0),
		-max(-tap_mm, 0.0), -max(-tap_zm, 0.0), -max(-tap_pm, 0.0),
		-max(-tap_mz, 0.0), -max(-tap_zz, 0.0), -max(-tap_pz, 0.0),
		-max(-tap_mp, 0.0), -max(-tap_zp, 0.0), -max(-tap_pp, 0.0));
}

// Pass 4 ("conv3"): reads t0 and writes t1.
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

// l0(x, y): point-sample t0 at offset (x, y) * pt from pos, as a V4.
#define l0(x, y) V4(O(t0, float2(x, y)))

// conv3 kernel. Starting from zero, adds mul(tap, 4x4 weights) for the nine
// s0_* taps, then for the nine s1_* taps, and finally a constant bias.
// Pass4 supplies s0_* as max(sample, 0) and s1_* as -max(-sample, 0) of the
// same 3x3 samples. The accumulation order is kept as-is so min16float
// rounding does not change.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	acc += mul(s0_0, M4(9.384e-02, 1.183e-01, 5.136e-02, -4.583e-01, -1.060e-01, 6.124e-02, -1.479e-01, -2.457e-01, -5.881e-02, 4.756e-03, -2.540e-02, -5.047e-02, -1.897e-01, 4.062e-02, 1.226e-02, 1.465e-01));
	acc += mul(s0_1, M4(-1.890e-01, -9.535e-02, 2.627e-01, 3.224e-01, 1.050e-01, -3.922e-02, -3.551e-01, -2.632e-01, -2.349e-01, -5.605e-02, -2.856e-01, 4.331e-01, -2.614e-02, -6.027e-02, -3.236e-02, 2.873e-01));
	acc += mul(s0_2, M4(-1.702e-01, 7.462e-02, 2.168e-01, 4.212e-01, 8.150e-03, 6.671e-02, -2.781e-01, -1.322e-01, -3.933e-02, 2.698e-02, -3.420e-01, -1.116e-02, -1.788e-02, 8.701e-03, -1.044e-01, 1.264e-01));
	acc += mul(s0_3, M4(3.573e-01, -4.592e-02, 4.539e-01, 2.854e-01, -6.463e-01, -1.763e-01, 6.236e-01, 7.125e-02, 4.126e-01, -1.621e-02, 1.685e-02, 2.328e-01, -5.456e-01, -2.113e-01, 1.424e-01, 1.414e-01));
	acc += mul(s0_4, M4(3.838e-01, -1.008e+00, 4.023e-01, 1.302e+00, -1.503e-01, 4.245e-02, 1.496e+00, -3.479e-01, -3.763e-01, -7.877e-01, 4.081e-01, -2.192e-01, -2.853e-01, 2.123e-01, -3.407e-01, 2.423e-01));
	acc += mul(s0_5, M4(5.073e-03, -2.123e-01, 1.851e-01, 1.482e-01, -2.814e-01, 1.262e-01, 6.890e-01, -2.317e-01, 6.427e-02, -5.801e-02, -3.684e-02, 7.526e-02, 1.309e-02, -2.125e-02, -7.760e-02, 4.795e-02));
	acc += mul(s0_6, M4(1.409e-01, -1.062e-01, 1.665e-01, 5.277e-01, 6.676e-01, -1.872e-01, 1.251e+00, 1.165e-01, -2.287e-02, -5.235e-02, -2.028e-03, -3.305e-02, -1.968e-01, 1.898e-01, -9.538e-02, -1.418e-01));
	acc += mul(s0_7, M4(7.353e-02, -3.073e-01, 1.789e-01, 2.137e-01, -6.435e-01, -6.052e-01, 2.259e+00, 2.884e-02, 7.105e-04, 1.247e-01, -7.393e-02, 2.539e-02, 1.194e-01, 1.870e-01, -1.126e-01, 2.444e-02));
	acc += mul(s0_8, M4(3.853e-02, -2.242e-01, 1.470e-01, 1.701e-02, 4.586e-02, 2.027e-01, 7.448e-01, -4.414e-01, 9.096e-03, 1.277e-01, 4.010e-02, 1.064e-02, 2.401e-02, 1.901e-02, 1.956e-02, 8.744e-02));
	acc += mul(s1_0, M4(-4.741e-02, 1.819e-03, -8.321e-02, -1.496e-01, -1.801e-02, 4.682e-02, -6.041e-02, -7.243e-02, -1.478e-01, 4.970e-02, 6.424e-02, -5.378e-02, -9.117e-02, 5.496e-02, -2.648e-02, -4.042e-02));
	acc += mul(s1_1, M4(-8.815e-02, 5.938e-02, -2.433e-01, 1.737e-01, 1.095e-01, -5.108e-02, -5.729e-02, 8.334e-03, -2.763e-01, -6.431e-02, -2.454e-02, 4.055e-01, 2.113e-02, -1.298e-01, -3.908e-02, -1.780e-02));
	acc += mul(s1_2, M4(-1.905e-02, 3.894e-02, -1.293e-01, 8.303e-03, -7.800e-03, -5.508e-03, 8.606e-02, -7.501e-02, 1.542e-02, 3.046e-02, -2.920e-01, -4.240e-02, -3.932e-02, -1.813e-02, -8.213e-02, 1.017e-01));
	acc += mul(s1_3, M4(1.965e-01, 3.626e-02, 3.418e-02, 9.779e-02, -6.664e-02, -2.295e-02, -2.736e-02, 1.091e-01, 1.129e-01, -3.896e-02, 1.171e-02, -2.870e-02, -1.382e-01, -1.691e-01, 3.018e-01, -1.186e-01));
	acc += mul(s1_4, M4(1.075e-01, -6.894e-01, 1.714e-01, 5.097e-01, 9.868e-03, 1.087e-01, 2.107e-01, -6.591e-02, -3.233e-01, -9.792e-01, -1.189e-01, -5.480e-01, -1.157e-01, 5.941e-02, -5.770e-01, -1.030e-01));
	acc += mul(s1_5, M4(3.289e-02, 3.941e-02, 1.824e-01, 7.260e-04, -9.787e-03, 3.128e-02, -1.333e-01, 1.352e-01, 5.954e-03, -2.520e-01, -8.536e-02, -3.566e-01, 2.998e-02, -5.941e-02, -8.531e-02, -4.232e-02));
	acc += mul(s1_6, M4(2.592e-02, -7.528e-02, -1.956e-02, 1.002e-01, 2.992e-02, -1.673e-01, 4.413e-02, 1.683e-01, 1.440e-02, -1.047e-02, 1.425e-02, -1.292e-01, -1.777e-01, 1.220e-01, -6.381e-02, 4.174e-02));
	acc += mul(s1_7, M4(-3.107e-02, -8.612e-02, 1.248e-02, -8.544e-02, -1.161e-01, 7.718e-02, -1.150e-01, -1.699e-01, -1.392e-02, 7.590e-02, -5.195e-02, -3.599e-01, 4.872e-02, 1.381e-01, -1.143e-01, -1.473e-03));
	acc += mul(s1_8, M4(1.277e-02, 3.020e-02, 4.658e-02, 8.071e-02, 6.867e-02, -2.693e-02, 7.897e-02, -1.264e-02, -1.035e-03, 1.509e-01, 4.169e-02, -1.716e-01, -4.694e-03, 1.627e-02, 7.171e-03, -4.496e-02));
	// Bias term.
	acc += V4(4.014e-03, -2.020e-02, 1.560e-02, -2.352e-02);
	return acc;
}

// Pass 4 entry point. Each thread handles one texel: it reads the 3x3
// neighbourhood of t0, splits every sample into a non-negative part and a
// non-positive part, runs both sets through f0(), and stores the result in t1.
void Pass4(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that land outside the image have nothing to write.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	// `pt` and `pos` must keep these names: the O()/l0() macros capture them.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps. Suffix letters give the (x, y) offset: m = -1, z = 0, p = +1.
	const V4 tap_mm = l0(-1.0, -1.0);
	const V4 tap_zm = l0(0.0, -1.0);
	const V4 tap_pm = l0(1.0, -1.0);
	const V4 tap_mz = l0(-1.0, 0.0);
	const V4 tap_zz = l0(0.0, 0.0);
	const V4 tap_pz = l0(1.0, 0.0);
	const V4 tap_mp = l0(-1.0, 1.0);
	const V4 tap_zp = l0(0.0, 1.0);
	const V4 tap_pp = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t1[texel] = f0(
		max(tap_mm, 0.0), max(tap_zm, 0.0), max(tap_pm, 0.0),
		max(tap_mz, 0.0), max(tap_zz, 0.0), max(tap_pz, 0.0),
		max(tap_mp, 0.0), max(tap_zp, 0.0), max(tap_pp, 0.0),
		-max(-tap_mm, 0.0), -max(-tap_zm, 0.0), -max(-tap_pm, 0.0),
		-max(-tap_mz, 0.0), -max(-tap_zz, 0.0), -max(-tap_pz, 0.0),
		-max(-tap_mp, 0.0), -max(-tap_zp, 0.0), -max(-tap_pp, 0.0));
}

// Pass 5 ("conv4"): reads t1 and writes t0.
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

// l0(x, y): point-sample t1 at offset (x, y) * pt from pos, as a V4.
#define l0(x, y) V4(O(t1, float2(x, y)))

// conv4 kernel. Starting from zero, adds mul(tap, 4x4 weights) for the nine
// s0_* taps, then for the nine s1_* taps, and finally a constant bias.
// Pass5 supplies s0_* as max(sample, 0) and s1_* as -max(-sample, 0) of the
// same 3x3 samples. The accumulation order is kept as-is so min16float
// rounding does not change.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	acc += mul(s0_0, M4(3.174e-02, -2.020e-01, -6.843e-03, 1.049e-01, 1.680e-01, -6.387e-01, -1.541e-01, -1.952e-01, -4.586e-02, -1.580e-01, -5.507e-02, 1.065e-01, -5.257e-03, -9.464e-02, -9.788e-02, 1.221e-01));
	acc += mul(s0_1, M4(1.365e-01, -4.220e-02, -4.186e-02, -1.569e-01, -5.527e-01, -1.180e-01, -2.274e-01, -2.007e-01, 2.207e-02, 1.190e-02, 3.746e-02, -1.565e-01, -2.808e-02, 1.657e-02, -5.376e-02, -1.093e-02));
	acc += mul(s0_2, M4(-7.935e-02, -3.809e-02, -3.727e-02, -4.730e-02, -8.556e-02, 3.451e-04, -8.191e-02, 8.086e-02, 2.051e-02, 7.072e-03, 2.537e-02, 2.793e-02, 9.384e-04, -3.624e-02, -2.171e-02, 7.103e-02));
	acc += mul(s0_3, M4(-1.261e-02, 2.716e-01, 2.739e-01, -7.349e-02, -2.130e-02, -4.131e-01, -1.851e-01, 1.065e-01, -7.827e-02, 2.868e-01, -1.500e-01, -1.442e-01, -1.842e-02, -2.983e-01, -4.232e-02, 1.395e-01));
	acc += mul(s0_4, M4(2.733e-01, 4.015e-01, 4.102e-01, -2.027e-01, 4.229e-01, 2.213e-01, 3.628e-01, -1.011e-01, -4.893e-01, 1.333e-01, -4.245e-02, -8.133e-02, -1.086e-02, -1.089e-01, -8.720e-02, 1.513e-01));
	acc += mul(s0_5, M4(8.521e-02, 1.460e-01, 1.589e-01, -2.075e-01, -5.391e-02, 7.449e-03, -6.763e-02, -2.352e-01, 4.055e-02, -1.812e-02, -1.413e-02, 9.240e-02, -3.070e-02, -4.975e-03, -8.972e-02, -2.225e-02));
	acc += mul(s0_6, M4(1.880e-01, -1.481e-01, 1.001e-01, 6.339e-02, -6.208e-02, -2.814e-02, -5.944e-03, 1.002e-01, -7.822e-02, 1.010e-01, -2.161e-02, 9.175e-02, 1.495e-02, 1.645e-02, 8.901e-03, -3.865e-02));
	acc += mul(s0_7, M4(4.449e-01, -1.089e-01, -1.249e-01, -8.911e-01, 3.096e-02, 1.724e-01, 5.605e-02, -7.605e-02, -9.644e-02, -1.191e-01, -1.332e-01, 2.544e-02, 5.659e-02, -2.706e-04, -9.886e-02, 9.218e-02));
	acc += mul(s0_8, M4(7.394e-02, -2.112e-01, 1.505e-02, -1.236e-01, -1.848e-02, -2.716e-02, -6.663e-02, 2.764e-02, -1.120e-02, 3.440e-03, -1.443e-02, 1.745e-02, -3.847e-02, -4.228e-03, -8.888e-02, 2.134e-02));
	acc += mul(s1_0, M4(6.588e-03, -6.764e-02, -2.660e-02, -3.967e-02, 6.459e-02, -6.345e-01, -5.784e-01, 9.294e-02, 2.426e-02, -9.858e-02, -9.036e-02, -9.545e-02, 2.094e-02, -1.001e-01, -1.145e-01, -6.470e-02));
	acc += mul(s1_1, M4(-2.633e-03, 5.849e-02, 3.154e-02, -7.386e-02, -6.412e-01, -4.405e-01, -5.885e-01, 1.657e-01, -1.757e-01, -1.882e-02, -1.023e-01, -1.713e-01, -1.047e-01, -1.558e-01, -1.509e-01, -2.815e-01));
	acc += mul(s1_2, M4(1.880e-01, -3.790e-02, 1.112e-01, 1.672e-02, -1.713e-01, 2.611e-02, -9.008e-02, 9.359e-02, -6.567e-02, 9.399e-02, 3.743e-02, 3.662e-02, 3.190e-02, -1.466e-01, -1.154e-01, 1.692e-02));
	acc += mul(s1_3, M4(-1.733e-02, 1.381e-01, 8.342e-02, -5.893e-02, -1.467e-02, -4.365e-01, -3.057e-01, 1.506e-01, 7.300e-02, 6.777e-01, -5.484e-03, -3.499e-01, 1.978e-01, -6.846e-01, -2.921e-01, -1.173e-01));
	acc += mul(s1_4, M4(-1.829e-01, -4.506e-01, -5.685e-02, 8.260e-01, 3.056e-01, 1.803e-01, 1.908e-01, -2.029e-01, -1.578e-01, 5.039e-01, 3.016e-01, -4.971e-01, -4.977e-01, 4.537e-01, -4.268e-01, 7.878e-01));
	acc += mul(s1_5, M4(-3.251e-01, -1.229e-01, -1.447e-01, 3.290e-01, -2.134e-01, -6.542e-03, -7.109e-02, -1.004e-01, 3.887e-02, -1.008e-01, -7.490e-02, 6.126e-02, 2.757e-01, -1.980e-01, -1.792e-01, 2.722e-01));
	acc += mul(s1_6, M4(4.765e-02, -5.401e-02, 4.164e-02, 1.847e-03, -3.178e-02, -4.201e-02, -2.504e-02, 1.350e-02, -1.436e-01, 1.654e-01, -1.099e-02, -3.733e-02, 1.118e-01, -2.529e-01, -1.353e-01, -9.309e-02));
	acc += mul(s1_7, M4(1.684e-01, -1.978e-01, 2.645e-02, -9.582e-02, 2.618e-02, 9.350e-02, -2.281e-02, -1.901e-01, 1.176e-02, -1.571e-01, 1.491e-02, -2.105e-01, -1.685e-01, -2.459e-01, -2.166e-01, 1.082e-01));
	acc += mul(s1_8, M4(2.225e-02, 7.813e-02, -4.112e-02, 6.166e-02, -4.143e-02, -2.160e-02, -7.478e-02, -2.251e-02, -1.306e-02, -6.002e-02, -7.496e-02, -2.538e-03, 7.824e-02, 9.597e-02, -3.546e-03, -1.794e-01));
	// Bias term.
	acc += V4(-5.942e-03, -2.718e-02, -1.234e-02, 3.307e-02);
	return acc;
}

// Pass 5 entry point. Each thread handles one texel: it reads the 3x3
// neighbourhood of t1, splits every sample into a non-negative part and a
// non-positive part, runs both sets through f0(), and stores the result in t0.
void Pass5(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that land outside the image have nothing to write.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	// `pt` and `pos` must keep these names: the O()/l0() macros capture them.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps. Suffix letters give the (x, y) offset: m = -1, z = 0, p = +1.
	const V4 tap_mm = l0(-1.0, -1.0);
	const V4 tap_zm = l0(0.0, -1.0);
	const V4 tap_pm = l0(1.0, -1.0);
	const V4 tap_mz = l0(-1.0, 0.0);
	const V4 tap_zz = l0(0.0, 0.0);
	const V4 tap_pz = l0(1.0, 0.0);
	const V4 tap_mp = l0(-1.0, 1.0);
	const V4 tap_zp = l0(0.0, 1.0);
	const V4 tap_pp = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t0[texel] = f0(
		max(tap_mm, 0.0), max(tap_zm, 0.0), max(tap_pm, 0.0),
		max(tap_mz, 0.0), max(tap_zz, 0.0), max(tap_pz, 0.0),
		max(tap_mp, 0.0), max(tap_zp, 0.0), max(tap_pp, 0.0),
		-max(-tap_mm, 0.0), -max(-tap_zm, 0.0), -max(-tap_pm, 0.0),
		-max(-tap_mz, 0.0), -max(-tap_zz, 0.0), -max(-tap_pz, 0.0),
		-max(-tap_mp, 0.0), -max(-tap_zp, 0.0), -max(-tap_pp, 0.0));
}

// Pass 6 ("conv5"): reads t0 and writes t1.
//!PASS 6
//!DESC conv5
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

// l0(x, y): point-sample t0 at offset (x, y) * pt from pos, as a V4.
#define l0(x, y) V4(O(t0, float2(x, y)))

// conv5 kernel. Starting from zero, adds mul(tap, 4x4 weights) for the nine
// s0_* taps, then for the nine s1_* taps, and finally a constant bias.
// Pass6 supplies s0_* as max(sample, 0) and s1_* as -max(-sample, 0) of the
// same 3x3 samples. The accumulation order is kept as-is so min16float
// rounding does not change.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	acc += mul(s0_0, M4(-1.069e-01, 1.009e-01, -5.972e-02, -1.732e-02, -9.217e-02, 9.177e-03, -3.127e-02, -5.872e-02, -1.364e-02, -9.990e-04, 1.518e-01, 5.861e-02, -9.835e-02, -1.155e-01, 6.714e-02, -5.142e-02));
	acc += mul(s0_1, M4(1.404e-02, 1.372e-01, -2.759e-01, -4.361e-02, -1.407e-01, 1.570e-01, -1.216e-01, -7.289e-02, 3.088e-01, -1.285e-01, 1.107e-01, 1.651e-01, 1.596e-01, -1.569e-01, 1.437e-02, -1.455e-01));
	acc += mul(s0_2, M4(-4.001e-02, 1.772e-01, -2.761e-01, 4.916e-02, -1.489e-01, 1.680e-01, -5.244e-02, 1.334e-01, 1.245e-01, -2.321e-01, 5.371e-01, -2.549e-01, -9.624e-02, -1.072e-01, 2.322e-01, -2.261e-01));
	acc += mul(s0_3, M4(-2.291e-01, 7.774e-04, -1.015e-02, 6.036e-02, -1.133e-01, 7.554e-02, 1.081e-01, 1.704e-01, 2.123e-01, -2.065e-01, 4.928e-02, 2.352e-03, -2.488e-01, -1.765e-01, 2.044e-01, 1.302e-02));
	acc += mul(s0_4, M4(3.195e-01, -5.410e-01, -4.771e-01, -1.713e-01, 2.778e-01, -1.028e-01, 8.603e-02, 2.162e-01, 1.466e-02, 2.633e-02, -3.299e-01, -5.183e-02, -3.598e-01, -4.015e-01, 5.674e-02, -1.429e-01));
	acc += mul(s0_5, M4(-1.480e-01, 2.440e-01, -2.189e-01, 1.407e-01, -3.439e-01, 2.624e-01, 4.947e-01, 7.813e-01, 1.067e-01, -6.781e-02, -5.271e-02, -1.331e-02, -2.133e-01, -1.038e-01, 4.267e-01, -4.026e-01));
	acc += mul(s0_6, M4(-1.086e-01, 2.607e-01, -1.897e-01, -1.710e-01, 6.096e-02, -1.121e-01, 8.797e-02, -8.204e-02, 4.825e-02, -9.364e-02, 8.472e-02, -1.923e-02, -1.755e-01, 1.086e-01, -3.987e-02, 1.737e-02));
	acc += mul(s0_7, M4(5.606e-02, 4.516e-02, -7.352e-02, 7.654e-02, -6.706e-02, 2.674e-01, -2.388e-01, -1.997e-01, 9.871e-02, -9.055e-02, 1.274e-01, 1.854e-01, -1.765e-01, -1.779e-01, 1.114e-01, -1.882e-01));
	acc += mul(s0_8, M4(-4.811e-02, 2.057e-01, -2.913e-01, 1.265e-01, 1.304e-01, 1.462e-01, -4.432e-03, 4.191e-01, 6.606e-02, -1.382e-01, 1.052e-01, -3.990e-01, 9.737e-02, -9.675e-02, 6.216e-02, -2.130e-01));
	acc += mul(s1_0, M4(-1.183e-01, -5.696e-02, 9.372e-02, 3.074e-03, -2.694e-02, -2.272e-02, -3.489e-02, -2.667e-02, 1.635e-01, -5.761e-04, -1.677e-03, -1.076e-01, -5.411e-02, -1.100e-02, 1.742e-02, 6.403e-02));
	acc += mul(s1_1, M4(-4.462e-03, 3.912e-02, -1.208e-01, -9.360e-02, -1.260e-01, 1.602e-02, -1.047e-01, -1.252e-01, 2.940e-01, 1.068e-01, -2.602e-01, 1.692e-01, 1.120e-01, -2.613e-02, -1.083e-02, 1.754e-02));
	acc += mul(s1_2, M4(2.307e-02, 1.240e-01, -2.024e-01, 1.761e-01, -2.326e-01, 3.209e-02, 5.352e-02, 3.399e-02, 1.754e-01, -3.059e-01, 4.554e-01, -2.412e-01, 4.242e-03, 3.919e-02, 7.769e-02, -1.155e-01));
	acc += mul(s1_3, M4(-1.946e-01, -9.445e-02, 1.698e-01, 1.165e-01, -1.571e-01, 1.700e-02, 5.682e-02, 4.628e-02, 4.425e-01, -1.872e-01, 3.713e-02, 8.537e-02, 4.211e-02, -6.178e-02, 1.398e-02, 5.929e-02));
	acc += mul(s1_4, M4(5.957e-01, -6.855e-01, -3.668e-01, -2.565e-01, -4.383e-02, -8.094e-02, -2.101e-02, -2.446e-01, -7.781e-02, 5.879e-01, -5.272e-01, -1.786e-01, -2.396e-01, -4.148e-01, 5.226e-02, 9.011e-02));
	acc += mul(s1_5, M4(-4.655e-02, 1.107e-01, -1.109e-01, 3.601e-01, -2.103e-01, 3.712e-01, 1.666e-01, 3.972e-01, -2.227e-02, -2.115e-02, -7.054e-02, -1.216e-01, 4.739e-03, 1.201e-01, 1.335e-01, -1.775e-01));
	acc += mul(s1_6, M4(-7.542e-02, 9.157e-02, 1.143e-02, -7.961e-02, -3.812e-02, 1.722e-02, 1.396e-02, -3.920e-02, -6.220e-03, -6.723e-02, 9.364e-02, -4.804e-02, -8.885e-02, 1.313e-01, -7.872e-02, 2.733e-02));
	acc += mul(s1_7, M4(-3.879e-01, -2.705e-01, 3.305e-01, -1.542e-01, -1.179e-01, 9.695e-02, -1.353e-01, -2.320e-01, 1.433e-02, -2.689e-01, 2.066e-01, 3.704e-01, -5.587e-02, -6.296e-02, 6.326e-02, 1.881e-02));
	acc += mul(s1_8, M4(-4.722e-02, -5.909e-02, 4.089e-02, -8.851e-02, 2.017e-01, -2.652e-02, 9.432e-02, 3.252e-01, -2.219e-01, 2.142e-02, -4.496e-02, 5.456e-02, 2.364e-02, 1.081e-01, -9.898e-02, 9.928e-02));
	// Bias term.
	acc += V4(1.102e-03, 4.481e-03, 3.096e-03, -9.818e-03);
	return acc;
}

// Pass 6 entry point. Each thread handles one texel: it reads the 3x3
// neighbourhood of t0, splits every sample into a non-negative part and a
// non-positive part, runs both sets through f0(), and stores the result in t1.
void Pass6(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that land outside the image have nothing to write.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	// `pt` and `pos` must keep these names: the O()/l0() macros capture them.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps. Suffix letters give the (x, y) offset: m = -1, z = 0, p = +1.
	const V4 tap_mm = l0(-1.0, -1.0);
	const V4 tap_zm = l0(0.0, -1.0);
	const V4 tap_pm = l0(1.0, -1.0);
	const V4 tap_mz = l0(-1.0, 0.0);
	const V4 tap_zz = l0(0.0, 0.0);
	const V4 tap_pz = l0(1.0, 0.0);
	const V4 tap_mp = l0(-1.0, 1.0);
	const V4 tap_zp = l0(0.0, 1.0);
	const V4 tap_pp = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t1[texel] = f0(
		max(tap_mm, 0.0), max(tap_zm, 0.0), max(tap_pm, 0.0),
		max(tap_mz, 0.0), max(tap_zz, 0.0), max(tap_pz, 0.0),
		max(tap_mp, 0.0), max(tap_zp, 0.0), max(tap_pp, 0.0),
		-max(-tap_mm, 0.0), -max(-tap_zm, 0.0), -max(-tap_pm, 0.0),
		-max(-tap_mz, 0.0), -max(-tap_zz, 0.0), -max(-tap_pz, 0.0),
		-max(-tap_mp, 0.0), -max(-tap_zp, 0.0), -max(-tap_pp, 0.0));
}

// Pass 7 ("conv6"): reads t1 and writes t0.
//!PASS 7
//!DESC conv6
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

// l0(x, y): point-sample t1 at offset (x, y) * pt from pos, as a V4.
#define l0(x, y) V4(O(t1, float2(x, y)))

// conv6 kernel. Starting from zero, adds mul(tap, 4x4 weights) for the nine
// s0_* taps, then for the nine s1_* taps, and finally a constant bias.
// Pass7 supplies s0_* as max(sample, 0) and s1_* as -max(-sample, 0) of the
// same 3x3 samples. The accumulation order is kept as-is so min16float
// rounding does not change.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	acc += mul(s0_0, M4(-3.488e-02, 3.507e-02, 3.848e-02, -5.906e-02, 9.669e-02, 3.121e-02, -2.182e-02, 1.691e-01, -1.132e-01, -7.602e-02, -5.000e-02, -6.017e-03, 3.962e-02, 1.086e-01, -3.343e-04, 9.002e-02));
	acc += mul(s0_1, M4(9.453e-02, -1.793e-01, -6.074e-02, 5.317e-03, 1.056e-01, 3.460e-01, 5.291e-02, 7.825e-02, 5.510e-02, 4.818e-02, -1.119e-02, 3.913e-02, -8.177e-02, -1.060e-01, -9.989e-03, -9.245e-02));
	acc += mul(s0_2, M4(-8.190e-02, 1.375e-01, -4.322e-02, -6.721e-02, 1.645e-02, -1.392e-01, 7.103e-02, -1.950e-02, 4.302e-03, -3.213e-02, -7.517e-03, -3.406e-03, -2.132e-02, 1.333e-01, -6.553e-02, 7.300e-02));
	acc += mul(s0_3, M4(-1.102e-01, 3.005e-01, -8.521e-02, 3.002e-01, 1.866e-01, 1.089e-01, -2.968e-02, 1.271e-01, -3.566e-01, 1.224e-01, -7.462e-02, -2.765e-01, 5.175e-02, 1.567e-01, 1.450e-01, -1.948e-01));
	acc += mul(s0_4, M4(1.558e-01, 3.780e-02, 9.697e-02, -2.485e-01, -3.560e-01, -3.667e-01, 1.396e-01, 1.020e+00, -2.319e-01, -2.878e-01, -2.849e-01, 5.648e-01, 2.094e-01, -5.684e-01, 1.482e-01, -6.172e-01));
	acc += mul(s0_5, M4(-1.276e-01, -1.685e-01, 4.271e-01, -1.489e-01, 2.154e-01, 2.661e-01, -1.093e-01, -7.859e-02, 6.618e-02, 9.795e-02, 2.778e-02, -1.286e-01, -1.527e-01, -3.586e-01, 2.523e-01, 9.196e-02));
	acc += mul(s0_6, M4(-1.354e-01, -6.680e-02, 5.541e-02, -5.314e-02, 1.639e-02, -1.639e-01, -1.856e-01, -1.863e-01, -1.519e-01, -5.459e-02, 1.027e-01, 6.492e-02, 3.482e-02, -9.074e-03, 1.861e-01, 1.393e-01));
	acc += mul(s0_7, M4(1.907e-02, 1.189e-02, -5.038e-01, -8.478e-02, 3.643e-01, 1.086e-02, 3.067e-01, 1.071e-01, -6.552e-01, 1.505e-01, -7.394e-01, 1.155e-01, -1.815e-01, -1.739e-02, -2.723e-01, -1.607e-01));
	acc += mul(s0_8, M4(-8.319e-02, -2.563e-02, -1.127e-01, -7.792e-02, 1.295e-01, 1.091e-01, 2.920e-02, -5.761e-02, -9.443e-02, 7.429e-03, -2.117e-01, -3.670e-02, -7.118e-02, -4.469e-02, -6.460e-02, -1.261e-02));
	acc += mul(s1_0, M4(2.400e-02, -2.740e-02, -3.394e-02, 5.817e-02, -6.716e-02, -5.672e-02, -7.339e-02, -3.921e-02, -9.506e-02, -3.805e-02, -3.235e-02, -8.145e-02, 1.265e-02, 7.308e-02, -5.707e-02, 1.141e-01));
	acc += mul(s1_1, M4(-1.565e-01, 1.052e-01, -8.934e-02, -6.945e-02, 3.804e-02, 2.091e-01, -1.102e-01, 2.394e-01, 6.041e-02, -9.942e-02, -6.054e-03, 4.857e-02, -7.265e-02, 1.596e-02, 9.135e-02, -8.397e-02));
	acc += mul(s1_2, M4(-9.449e-02, 1.121e-01, -1.101e-01, -2.980e-02, 5.100e-02, -6.337e-02, 1.692e-01, -5.062e-02, -3.931e-02, 1.083e-01, 3.952e-03, 9.801e-04, -6.425e-02, 8.015e-02, -1.628e-01, 8.317e-02));
	acc += mul(s1_3, M4(7.400e-02, 8.412e-02, 2.984e-02, 8.693e-02, -1.474e-01, -3.529e-02, -6.134e-02, -1.107e-01, -3.264e-01, 8.009e-02, -2.261e-01, -1.472e-01, -4.683e-02, -1.258e-01, 1.061e-01, -1.125e-01));
	acc += mul(s1_4, M4(4.970e-01, -1.211e-01, 2.379e-01, 2.124e-01, -1.003e-01, -5.656e-01, 5.001e-02, 4.959e-01, 1.538e-01, -7.985e-01, -2.085e-01, 2.220e-01, 7.247e-02, 6.581e-02, -9.437e-02, -3.066e-01));
	acc += mul(s1_5, M4(-8.611e-02, -9.199e-02, 2.518e-01, -7.482e-02, -1.208e-01, 1.015e-01, 3.428e-02, -1.354e-01, 1.038e-01, -4.497e-02, 2.744e-01, -4.281e-02, 4.090e-02, -2.726e-01, 1.839e-01, 1.138e-01));
	acc += mul(s1_6, M4(-8.703e-02, -4.776e-02, -1.477e-01, -1.870e-02, -1.072e-01, 3.204e-02, -8.396e-03, 1.175e-01, 1.685e-01, -1.427e-01, 2.152e-01, -2.155e-01, 1.898e-03, 5.924e-02, -1.089e-02, 5.197e-02));
	acc += mul(s1_7, M4(-9.679e-02, 1.961e-02, 1.636e-01, -6.049e-02, -7.071e-03, 1.519e-01, -6.303e-01, 4.739e-02, -3.331e-01, 8.291e-02, -5.944e-01, -7.677e-02, -1.164e-01, -4.580e-02, 1.419e-01, 6.839e-02));
	acc += mul(s1_8, M4(6.174e-02, -4.004e-02, 1.256e-02, -4.981e-02, 4.659e-03, 8.371e-02, -1.664e-01, -2.897e-02, -1.253e-01, 2.381e-02, -1.147e-01, -8.724e-02, -3.736e-02, 1.140e-02, -1.550e-01, 6.350e-03));
	// Bias term.
	acc += V4(-6.918e-03, -1.945e-03, -7.751e-03, 1.645e-02);
	return acc;
}

// Pass 7 entry point: one thread per pixel of the input-sized grid.
// Samples the 3x3 neighbourhood through the pass-local l0 macro, splits every
// tap into max(x, 0) and -max(-x, 0), and writes f0(...) of those 18 vectors to t0.
//   blockStart - pixel coordinate of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8
void Pass7(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros refer to them.
	float2 pt = float2(GetInputPt());
	uint2 px = blockStart + Rmp8x8(tid.x);
	uint2 lim = GetInputSize();
	if (!(px.x < lim.x && px.y < lim.y)) {
		return;
	}
	float2 pos = (px + 0.5) * pt;

	// Raw taps in row-major order: y offset -1, 0, 1; within a row x offset -1, 0, 1.
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: non-negative parts; last nine: non-positive parts.
	t0[px] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}

//!PASS 8
//!DESC conv7
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

// l0(x, y): point-samples t0 at pos + (x, y) * pt (via the O macro) and converts
// the result to V4. Expands to code that needs `pos` and `pt` in scope.
#define l0(x, y) V4(O(t0, float2(x, y)))

// Pass 8 layer function.
// Takes 18 four-component vectors: s0_0..s0_8 and s1_0..s1_8 (Pass8 passes the
// max(x, 0) parts of the nine taps as s0_* and the -max(-x, 0) parts as s1_*).
// Each vector is multiplied, as the left operand of mul, by its own constant 4x4
// matrix; the products are summed in the order written and a constant bias
// vector is added last. Returns that sum.
// The constants and the accumulation order are part of the result; do not
// reorder or reformat the numeric literals by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	// Contributions of the s0_* inputs.
	r += mul(s0_0, M4(-4.116e-02, -3.385e-02, -4.697e-02, 4.650e-02, -3.488e-02, 1.006e-01, -4.538e-03, 4.637e-02, 1.288e-01, 7.769e-03, 1.150e-01, -7.930e-03, 1.045e-02, 4.849e-02, 2.767e-02, 4.909e-02));
	r += mul(s0_1, M4(-5.332e-01, -5.254e-01, -3.541e-01, -3.525e-01, 1.117e-02, 2.929e-02, 6.817e-02, 9.115e-02, 1.055e+00, 6.141e-02, 3.976e-01, 4.649e-02, 2.561e-01, -1.191e-01, 1.230e-03, -1.047e-01));
	r += mul(s0_2, M4(-1.450e-01, -2.829e-01, -6.857e-01, -4.294e-01, 3.217e-02, 2.745e-02, 5.242e-02, 3.556e-02, 1.284e-01, 4.292e-01, 7.161e-01, 2.220e-01, -1.508e-02, 1.802e-01, 1.842e-01, 9.827e-02));
	r += mul(s0_3, M4(1.381e-01, 5.690e-02, 5.107e-02, 6.625e-02, -1.173e-01, -7.448e-02, -1.152e-01, -1.808e-01, -1.470e-01, -1.833e-01, -1.653e-01, -1.217e-01, 9.096e-02, 5.579e-02, 1.128e-02, 9.791e-02));
	r += mul(s0_4, M4(5.936e-01, -2.579e-01, 5.761e-01, -7.051e-01, -7.023e-01, 2.824e-01, 2.057e-01, 3.628e-01, 1.006e-02, 3.209e-01, 6.969e-02, -3.464e-01, 4.768e-01, -3.194e-01, -4.817e-02, 3.050e-02));
	r += mul(s0_5, M4(2.145e-01, -1.899e-01, 1.446e-01, 2.497e-02, -8.750e-02, -3.154e-01, -5.060e-01, -7.413e-02, -8.542e-02, -4.198e-02, -1.528e-01, -1.812e-01, -2.597e-01, 8.374e-02, -5.592e-01, -2.557e-01));
	r += mul(s0_6, M4(5.713e-02, -4.294e-03, 2.388e-02, -7.124e-02, -2.163e-02, -3.642e-03, 3.839e-02, -6.934e-02, -9.052e-02, -1.153e-02, 1.213e-02, 7.120e-02, -3.698e-02, 4.260e-02, -7.245e-02, 7.898e-02));
	r += mul(s0_7, M4(2.780e-02, 1.944e-02, 1.415e-01, 1.216e-01, 9.163e-02, -3.069e-02, -1.829e-02, -2.182e-01, 5.815e-02, -1.923e-02, -5.934e-02, -3.487e-02, -1.082e-01, 1.362e-01, 8.120e-02, 2.621e-01));
	r += mul(s0_8, M4(9.334e-03, -1.300e-02, 4.936e-02, 1.751e-01, -1.214e-01, 1.629e-02, -1.131e-01, 7.402e-02, 1.134e-02, 1.663e-03, -5.887e-03, -8.862e-02, 1.029e-01, -5.629e-02, 9.127e-02, -6.668e-02));
	// Contributions of the s1_* inputs.
	r += mul(s1_0, M4(1.165e-02, 4.389e-02, 6.299e-03, 7.939e-02, -2.769e-02, 9.353e-02, 6.239e-02, 1.341e-02, 4.713e-02, -2.731e-03, 5.256e-02, -3.515e-02, -8.911e-02, -1.425e-01, -7.889e-02, -1.627e-01));
	r += mul(s1_1, M4(-2.869e-01, 2.838e-02, -2.541e-02, 5.216e-02, 2.660e-01, -2.095e-01, 1.375e-01, -2.562e-02, 2.715e-01, 1.694e-01, 9.471e-02, -7.292e-03, 3.257e-01, -2.247e-01, 7.698e-03, -2.076e-01));
	r += mul(s1_2, M4(1.832e-02, -1.860e-01, -4.951e-02, -1.392e-03, 9.307e-02, 7.671e-02, 1.043e-01, -3.675e-02, 1.433e-03, 1.219e-01, 1.978e-01, 5.960e-02, 9.624e-02, 1.448e-01, 3.561e-01, 3.054e-02));
	r += mul(s1_3, M4(-4.647e-02, 4.225e-03, 3.830e-02, -3.233e-02, -1.532e-01, -6.289e-01, -3.037e-01, -4.131e-01, -1.794e-01, -4.090e-02, -9.644e-02, -4.828e-02, 7.978e-02, 6.792e-03, 5.043e-02, 4.905e-02));
	r += mul(s1_4, M4(2.967e-01, -5.750e-02, 1.168e-01, -2.681e-02, 1.232e-01, -2.481e-03, 8.164e-01, 2.468e-01, -3.721e-01, -5.041e-02, -4.796e-01, -2.778e-02, 3.623e-01, -8.387e-01, -5.229e-01, -4.492e-01));
	r += mul(s1_5, M4(-1.507e-02, 1.343e-01, 8.567e-02, 8.923e-02, -2.766e-03, -1.548e-01, -2.588e-01, -1.295e-01, 1.777e-02, -9.243e-02, -4.495e-02, -5.528e-02, -1.071e-01, -1.284e-01, -4.142e-01, -1.800e-01));
	r += mul(s1_6, M4(-4.695e-03, -9.431e-04, -1.256e-02, -4.959e-03, 1.607e-01, -8.763e-02, 2.039e-01, -1.243e-01, 3.725e-02, -5.612e-02, -3.615e-03, -2.475e-02, 4.955e-02, 4.065e-02, -1.879e-02, -1.195e-01));
	r += mul(s1_7, M4(-1.039e-02, 5.631e-02, 2.655e-02, 7.419e-02, 1.286e-01, -6.430e-02, 4.800e-02, -4.480e-02, 5.067e-03, -4.197e-02, -3.342e-02, -7.461e-02, -3.225e-02, 6.062e-03, 5.391e-02, -7.135e-02));
	r += mul(s1_8, M4(5.195e-02, 3.799e-02, 1.130e-01, -8.811e-03, -4.285e-02, 1.609e-02, -8.972e-03, 3.530e-02, -6.932e-02, -3.013e-03, -5.208e-02, 5.823e-02, -4.561e-02, -1.068e-01, -1.458e-01, -5.739e-02));
	// Constant bias term.
	r += V4(1.569e-02, 1.505e-02, 2.765e-02, 1.258e-02);
	return r;
}

// Pass 8 entry point (conv7): one thread per pixel of the input-sized grid.
// Samples the 3x3 neighbourhood of t0 through l0, splits every tap into
// max(x, 0) and -max(-x, 0), and writes f0(...) of those 18 vectors to t1.
//   blockStart - pixel coordinate of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8
void Pass8(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros refer to them.
	float2 pt = float2(GetInputPt());
	uint2 px = blockStart + Rmp8x8(tid.x);
	uint2 lim = GetInputSize();
	if (!(px.x < lim.x && px.y < lim.y)) {
		return;
	}
	float2 pos = (px + 0.5) * pt;

	// Raw taps in row-major order: y offset -1, 0, 1; within a row x offset -1, 0, 1.
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: non-negative parts; last nine: non-positive parts.
	t1[px] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}

//!PASS 9
//!DESC conv8
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

// l0(x, y): point-samples t1 at pos + (x, y) * pt (via the O macro) and converts
// the result to V4. Expands to code that needs `pos` and `pt` in scope.
#define l0(x, y) V4(O(t1, float2(x, y)))

// Pass 9 layer function.
// Takes 18 four-component vectors: s0_0..s0_8 and s1_0..s1_8 (Pass9 passes the
// max(x, 0) parts of the nine taps as s0_* and the -max(-x, 0) parts as s1_*).
// Each vector is multiplied, as the left operand of mul, by its own constant 4x4
// matrix; the products are summed in the order written and a constant bias
// vector is added last. Returns that sum.
// The constants and the accumulation order are part of the result; do not
// reorder or reformat the numeric literals by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	// Contributions of the s0_* inputs.
	r += mul(s0_0, M4(4.724e-03, 6.987e-03, -3.797e-03, 2.147e-02, -5.616e-03, 1.123e-02, -2.768e-02, 8.185e-03, -4.051e-03, 5.608e-05, -9.522e-02, 2.924e-02, -5.976e-03, 8.331e-03, 7.513e-02, -2.513e-02));
	r += mul(s0_1, M4(-2.224e-02, 1.093e-04, 5.901e-02, 2.350e-02, 1.167e-01, -7.837e-02, 1.939e-01, 1.987e-01, 5.530e-02, -4.759e-05, 1.221e-01, 4.764e-02, -8.813e-02, 7.695e-02, -4.577e-01, 1.671e-02));
	r += mul(s0_2, M4(-5.696e-02, 6.005e-03, -5.620e-02, -8.978e-02, 4.014e-02, -3.822e-02, 1.081e-01, -6.532e-03, 9.444e-03, 7.498e-03, -3.228e-02, 4.908e-02, -2.043e-02, 2.374e-02, 2.163e-02, -4.505e-02));
	r += mul(s0_3, M4(-8.098e-02, 1.943e-02, -5.744e-02, 3.824e-02, -2.071e-01, 1.036e-01, -6.926e-02, -2.348e-01, 2.378e-01, -1.069e-01, -5.307e-02, 1.161e-01, 1.881e-01, -5.785e-02, -6.570e-02, 2.227e-01));
	r += mul(s0_4, M4(7.577e-02, -4.125e-02, 1.714e-01, -6.934e-01, -2.448e-01, 1.146e-01, 2.354e-01, -4.935e-01, -2.321e-01, -8.273e-02, 5.890e-02, 5.704e-01, 4.833e-02, 2.875e-02, 1.163e-01, -1.802e-01));
	r += mul(s0_5, M4(2.287e-01, -3.461e-02, -2.542e-02, 2.882e-02, 7.142e-02, -1.556e-01, 4.055e-02, 1.534e-02, -1.647e-01, 3.087e-03, -6.811e-02, -3.896e-02, 1.334e-01, 1.188e-01, -1.847e-01, 4.293e-02));
	r += mul(s0_6, M4(-3.094e-02, 2.712e-03, 3.387e-03, 1.877e-02, 9.494e-02, -2.863e-02, -4.239e-02, -3.402e-02, 5.541e-03, -1.178e-02, 1.795e-02, -3.515e-02, -3.044e-02, -2.463e-02, -1.320e-02, 8.952e-02));
	r += mul(s0_7, M4(1.035e-01, -3.181e-02, 1.902e-02, 3.973e-03, 2.267e-01, -2.620e-01, 1.821e-01, 1.631e-01, 1.494e-02, 6.125e-02, -6.176e-02, -2.497e-02, -1.364e-02, 7.542e-02, -8.480e-02, -4.648e-02));
	r += mul(s0_8, M4(-1.466e-01, 3.028e-02, 2.798e-02, -7.887e-02, -4.370e-02, 1.408e-02, -6.161e-02, -3.034e-02, 6.567e-02, 2.071e-02, 3.126e-02, 6.993e-02, -5.556e-02, 1.507e-02, 2.991e-02, -4.924e-02));
	// Contributions of the s1_* inputs.
	r += mul(s1_0, M4(1.637e-02, -2.767e-02, 8.568e-02, -4.254e-02, 3.215e-02, 1.987e-04, -3.697e-02, 3.787e-02, 2.236e-02, -6.576e-02, 7.400e-02, 1.093e-01, 3.271e-03, 1.809e-03, 1.011e-02, 1.509e-01));
	r += mul(s1_1, M4(5.538e-02, -5.865e-02, 4.351e-01, 2.494e-01, 1.101e-01, -1.484e-02, 5.176e-01, 3.999e-02, -4.782e-03, 1.155e-01, -2.099e-01, 5.012e-03, -1.919e-01, 2.292e-01, -5.378e-01, -1.223e-01));
	r += mul(s1_2, M4(-5.691e-02, 7.653e-02, -2.572e-01, -1.332e-01, -5.652e-02, -5.008e-02, 7.840e-02, -3.729e-02, 6.942e-02, 6.483e-04, -2.243e-05, 8.430e-02, -6.848e-02, -2.096e-02, -3.908e-02, -9.062e-02));
	r += mul(s1_3, M4(2.725e-01, -1.841e-01, -6.710e-03, 3.965e-01, -1.298e-01, -4.014e-03, 2.007e-01, -3.700e-01, 5.329e-01, -4.014e-01, 2.619e-02, 1.606e-01, 2.179e-01, -1.403e-01, 4.227e-02, 8.568e-02));
	r += mul(s1_4, M4(4.188e-01, -7.320e-01, -5.609e-01, -6.087e-01, -7.521e-01, 7.363e-01, -6.253e-01, -2.011e-01, -1.017e+00, 3.331e-02, -2.135e-02, 2.084e-01, 6.074e-01, -9.824e-01, 5.154e-01, 1.748e-01));
	r += mul(s1_5, M4(1.733e-01, 5.176e-01, -7.335e-02, 1.899e-02, 1.028e-01, -6.330e-02, -1.632e-01, 9.241e-05, -1.357e-01, -1.131e-01, 9.644e-02, -1.424e-02, -1.835e-02, 7.296e-01, -3.204e-01, -2.966e-02));
	r += mul(s1_6, M4(4.798e-02, -1.047e-01, 3.646e-02, 6.703e-02, 5.371e-02, 4.759e-02, -2.975e-02, -6.945e-02, 7.985e-02, -1.101e-01, 3.034e-02, -1.472e-02, -3.827e-02, 9.839e-03, -4.922e-03, 4.307e-03));
	r += mul(s1_7, M4(-8.950e-02, -1.253e-02, -1.730e-05, 3.862e-02, 2.692e-01, -4.645e-01, 2.399e-01, 2.744e-01, -4.503e-02, 1.724e-01, -7.935e-02, -5.200e-02, -2.132e-03, -1.926e-02, 2.926e-02, -2.288e-02));
	r += mul(s1_8, M4(-1.072e-01, -1.145e-02, 6.605e-03, -1.090e-01, -7.524e-03, 8.598e-02, -7.698e-02, -6.976e-02, 5.869e-02, -5.499e-02, 3.529e-02, 7.813e-02, -1.794e-01, 4.212e-02, -4.479e-03, -7.253e-02));
	// Constant bias term.
	r += V4(8.112e-05, 3.290e-03, -6.342e-04, 1.340e-02);
	return r;
}

// Pass 9 entry point (conv8): one thread per pixel of the input-sized grid.
// Samples the 3x3 neighbourhood of t1 through l0, splits every tap into
// max(x, 0) and -max(-x, 0), and writes f0(...) of those 18 vectors to t0.
//   blockStart - pixel coordinate of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8
void Pass9(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros refer to them.
	float2 pt = float2(GetInputPt());
	uint2 px = blockStart + Rmp8x8(tid.x);
	uint2 lim = GetInputSize();
	if (!(px.x < lim.x && px.y < lim.y)) {
		return;
	}
	float2 pos = (px + 0.5) * pt;

	// Raw taps in row-major order: y offset -1, 0, 1; within a row x offset -1, 0, 1.
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: non-negative parts; last nine: non-positive parts.
	t0[px] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}

//!PASS 10
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT

// l0(x, y): point-samples t0 at pos + (x, y) * pt (via the O macro) and converts
// the result to V4. Expands to code that needs `pos` and `pt` in scope.
#define l0(x, y) V4(O(t0, float2(x, y)))

// Pass 10 layer function.
// Takes 18 four-component vectors: s0_0..s0_8 and s1_0..s1_8 (Pass10 passes the
// max(x, 0) parts of the nine taps as s0_* and the -max(-x, 0) parts as s1_*).
// Each vector is multiplied, as the left operand of mul, by its own constant 4x4
// matrix; the products are summed in the order written, a constant bias vector
// is added, and tanh of the total is returned.
// Pass10 uses the four returned components as per-pixel offsets for a 2x2 output
// block (x, y for the first row, z, w for the second).
// The constants and the accumulation order are part of the result; do not
// reorder or reformat the numeric literals by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	// Contributions of the s0_* inputs.
	r += mul(s0_0, M4(-1.348e-01, -9.107e-02, -4.849e-02, 4.484e-04, -3.384e-02, -6.768e-02, -9.628e-03, -1.766e-02, -9.939e-03, -2.182e-02, -1.288e-02, 8.518e-03, 2.218e-02, -1.184e-03, 1.240e-03, 1.065e-02));
	r += mul(s0_1, M4(7.301e-02, 1.014e-01, -1.363e-02, -4.850e-02, -2.842e-01, -3.060e-02, -4.154e-02, 4.057e-03, -3.458e-02, -6.335e-02, -2.660e-02, -1.335e-02, -7.944e-03, 8.560e-03, 4.588e-02, 7.580e-03));
	r += mul(s0_2, M4(-9.349e-05, -2.142e-02, -1.258e-04, -9.330e-03, -5.058e-03, 3.912e-02, -2.976e-02, 2.410e-02, -8.512e-03, 4.954e-02, -2.093e-02, -2.582e-03, 1.648e-02, 7.942e-03, 1.520e-02, 3.414e-02));
	r += mul(s0_3, M4(-1.021e-01, -1.140e-01, 2.412e-01, 1.289e-02, -1.192e-01, -1.140e-01, 2.395e-01, 6.930e-03, -2.027e-01, -4.824e-02, 1.243e-01, 3.820e-03, 9.280e-03, 2.866e-02, 2.106e-02, 3.644e-03));
	r += mul(s0_4, M4(8.727e-02, 8.162e-02, 1.478e-01, 5.348e-01, -3.115e-01, -2.605e-02, 1.510e-01, 7.249e-01, -8.110e-02, -6.698e-01, 1.080e-01, -8.090e-02, -3.492e-01, -1.891e-01, -1.877e-01, -1.319e-01));
	r += mul(s0_5, M4(-5.622e-03, 2.237e-02, -4.008e-03, -1.980e-02, -1.837e-02, -3.311e-02, 4.289e-02, 3.256e-02, -2.178e-02, 2.653e-02, -1.722e-03, 8.373e-02, -8.042e-02, -2.962e-01, -4.643e-03, -6.865e-02));
	r += mul(s0_6, M4(6.248e-03, -2.320e-02, 1.883e-03, -1.430e-02, 1.224e-02, 5.634e-03, -1.964e-02, -1.627e-02, 2.010e-02, 1.174e-02, -3.919e-02, 9.559e-04, 3.016e-02, -2.836e-03, 7.667e-02, 3.552e-02));
	r += mul(s0_7, M4(-6.141e-03, 1.380e-02, 1.024e-02, -1.210e-02, 4.548e-02, 3.626e-02, -9.142e-02, -7.666e-02, -3.241e-02, -2.296e-02, -3.244e-02, -2.870e-01, 4.427e-02, 8.899e-02, -1.327e-01, 4.920e-02));
	r += mul(s0_8, M4(-2.801e-03, -9.930e-04, -2.770e-03, 1.623e-02, 2.158e-03, -1.258e-02, -3.089e-02, 3.211e-02, -9.620e-03, 1.776e-02, -4.337e-03, 4.676e-02, 1.130e-02, -7.436e-03, -3.572e-02, -1.742e-01));
	// Contributions of the s1_* inputs.
	r += mul(s1_0, M4(-4.306e-02, -6.039e-02, -1.642e-02, -1.966e-02, -5.996e-02, -1.743e-01, -3.128e-02, 1.714e-02, -4.357e-03, -7.720e-03, -4.532e-03, 4.571e-03, 3.988e-02, 2.067e-02, 1.548e-02, -2.964e-04));
	r += mul(s1_1, M4(-6.070e-02, -9.324e-02, 7.472e-03, 2.173e-02, -7.996e-02, -5.139e-02, -5.545e-02, -1.891e-02, -1.767e-02, -1.527e-02, -2.906e-02, 1.310e-02, 2.594e-02, 7.495e-02, -7.681e-03, -4.678e-03));
	r += mul(s1_2, M4(4.883e-02, -3.167e-02, 2.862e-02, 3.357e-02, -8.454e-03, 9.265e-03, -1.657e-02, -8.086e-03, -1.170e-02, -3.549e-02, 7.437e-03, 1.425e-02, 1.441e-02, -1.961e-02, 1.560e-02, -1.122e-02));
	r += mul(s1_3, M4(-6.156e-02, -6.763e-02, 1.987e-01, 2.459e-02, -5.710e-02, -2.009e-01, 4.581e-01, -1.181e-02, -9.054e-02, -5.658e-02, 3.432e-02, 2.004e-02, 6.965e-03, -1.655e-02, 5.178e-03, -1.236e-02));
	r += mul(s1_4, M4(-1.301e-01, -5.093e-02, 7.676e-01, 6.003e-01, -9.216e-02, -2.228e-03, 7.034e-02, 1.851e-01, -5.318e-01, -1.852e-01, -2.980e-02, 1.919e-02, -8.147e-01, -1.773e-01, -2.675e-01, 2.314e-02));
	r += mul(s1_5, M4(-9.962e-03, -1.888e-01, -1.877e-02, 1.190e-01, -2.283e-02, -1.241e-02, 2.969e-04, 3.894e-02, -5.077e-02, 2.300e-01, -6.192e-02, 1.793e-01, 5.384e-03, -4.378e-01, 2.970e-02, -1.125e-01));
	r += mul(s1_6, M4(1.376e-02, -2.891e-03, 9.292e-03, -1.288e-03, 2.615e-02, 2.656e-02, -8.111e-02, -1.779e-02, -7.512e-03, 9.174e-03, -4.553e-02, -1.139e-02, 2.259e-02, 4.351e-03, 4.963e-02, 2.002e-02));
	r += mul(s1_7, M4(9.993e-03, 1.250e-02, -2.090e-02, 6.839e-03, 3.502e-03, 2.070e-03, -5.530e-02, -2.855e-03, 1.144e-02, -4.191e-02, -8.395e-02, -2.056e-01, 6.909e-02, 7.425e-02, -2.374e-01, 8.636e-02));
	r += mul(s1_8, M4(-1.762e-02, -3.804e-04, 2.643e-02, 4.383e-02, 1.748e-03, -1.201e-02, -8.452e-03, -1.216e-02, -1.203e-02, -3.454e-02, -1.957e-02, 2.212e-01, -1.375e-02, -1.094e-02, -1.245e-02, -2.124e-01));
	// Constant bias term.
	r += V4(3.107e-03, 3.655e-03, 5.416e-04, 5.397e-04);
	// tanh keeps every component of the returned offset within [-1, 1].
	return tanh(r);
}

// Pass 10 entry point (out-shuffle): each thread produces a 2x2 block of OUTPUT.
// It evaluates f0 on the 3x3 neighbourhood of t0 around the source pixel
// (dst >> 1), then for each of the four output pixels it bilinearly samples
// INPUT, converts to YUV, adds one component of the f0 result to Y (clamped to
// [0, 1] with saturate), converts back to RGB and stores it with alpha 1.
//   blockStart - output pixel coordinate of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8
void Pass10(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros refer to them.
	float2 pt = float2(GetInputPt());
	uint2 dst = blockStart + (Rmp8x8(tid.x) << 1);
	uint2 lim = GetOutputSize();
	if (!(dst.x < lim.x && dst.y < lim.y)) {
		return;
	}
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// Raw taps in row-major order: y offset -1, 0, 1; within a row x offset -1, 0, 1.
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: non-negative parts; last nine: non-positive parts.
	V4 delta = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	float2 opt = float2(GetOutputPt());

	// Start half an output texel before the source-pixel centre, then walk the
	// 2x2 block as (0,0) -> (+x) -> (+y) -> (-x). The incremental updates are
	// kept so the sampling coordinates are computed exactly as before.
	float2 uv = pos - 0.5f * opt;
	float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// (1, 0) uses delta.y.
	++dst.x;
	uv.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// (1, 1) uses delta.w.
	++dst.y;
	uv.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// (0, 1) uses delta.z.
	--dst.x;
	uv.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
